#!/usr/bin/perl

use POSIX;
use strict;
use Cwd 'abs_path';
use List::Util qw(min max);
use Statistics::Descriptive;
use Getopt::Long;
use Bio::AlignIO;
use Bio::Tools::GFF;
use Bio::SeqFeatureI;

use lib './scripts';
use ConservatoryUtils;

my $lociListFileName;
my $networkName="myNetwork";
my $conservatoryDir=abs_path(".");
my $genomeDatabaseFileName = "genome_database.csv";
my $help=0;
my $verbose=0;
my $force=0;

GetOptions ("conservatory-directory=s" => \$conservatoryDir,
		    "genome-database=s" => \$genomeDatabaseFileName,
			"loci-list=s" => \$lociListFileName,
			"name=s" => \$networkName,
			"verbose" => \$verbose,
			"force" => \$force,
			"help" => \$help) or die ("Error in command line arguments\n");
			
			
if($help || $lociListFileName eq "") {
   print "buildCNSNetwork --loci-list <File name containing list of genes>\n\n\n";
   print "\t--conservatory-directory\t\tPath of the main conservatory directory.\n";
   print "\t--loci-list\t\t\tFile containing list of genes (REQUIRED).\n";
   print "\t--name\t\t\tName prefix for output files.\n";
   print "\t--force\t\t\tRecreate merged CNS file.\n";
   print "\t--verbose\t\t\tPrint extra progress information.\n";
   print "\t--help\t\t\tShow this message.\n";

   die;
}


#### Read the genome database file
my %genomeDatabase;
my $genomedbFile = "$conservatoryDir/$genomeDatabaseFileName";
my $tmpDir = "$conservatoryDir/tmp/";

open (my $genomeDatabase, "<", $genomedbFile);
while(my $curgenomeline = <$genomeDatabase>) {
	if(substr($curgenomeline,0,1) ne "#")  {
		chomp($curgenomeline);
		my ($curgenomeName, $curgenomeSpecies,  $curgenomeFamily, $curgenomeReference, $upstreamLength, $downstreamLength, $geneNameField, $geneProcessingRegEx, $gene2SpeciesIdentifier, $proteinProcessingRegEx,$classification) = split /,/, $curgenomeline;
		$genomeDatabase{$curgenomeSpecies}{'Family'} = $curgenomeFamily;
		$genomeDatabase{$curgenomeSpecies}{'Classification'} = $classification;	
	}
}

###### First, assemble the CNS and mapping

my $allCNSMergedFileName = "$conservatoryDir/output/conservatoryV7.final.cns.csv";
my $allPositionsMergedFileName = "$conservatoryDir/output/conservatoryV7.final.map.csv";

my $myLociPositionsFileName = "$conservatoryDir/output/$networkName.map.csv";


#### Now filter just for the loci we want
print localtime(). ": Filter loci...\n";
system("cut -d',' -f1 $conservatoryDir/$lociListFileName | sed 's/.*-.*-//' | grep -f - $allPositionsMergedFileName | cut -d',' -f1 | grep -f - $allPositionsMergedFileName > $myLociPositionsFileName.tmp");
system("cut -d',' -f1 $conservatoryDir/$lociListFileName | sed 's/.*-.*-//' | grep -f - output/conservatoryOrthologs.csv | cut -d',' -f1 | sort | uniq | grep -f - $myLociPositionsFileName.tmp > $myLociPositionsFileName");
unlink("$myLociPositionsFileName.tmp");
### Build the CNS dictionary hash
my %cnsDic;
my %nodesDic;

my %nodeCNSHistory;
open(my $allCNSFile, $allCNSMergedFileName) || die ("Internal error");

while(<$allCNSFile>) {
	chomp;
	my ($referenceGenome, $CNSID, $locus, $start, $length, $level, $supportingSpecies, $ancesteralSeq) = split /,/;

	$cnsDic{$CNSID} = {
				'RefGenome' => $referenceGenome,
				'CNSID' => $CNSID,
				'Locus' => $locus,
				'Start' => $start,
				'Len' => $length,
				'Level' => $level,
				'SupportingSpecies' => $supportingSpecies,
				'AncesteralSeq' => $ancesteralSeq,
	};
	
	$nodesDic{$CNSID}{'Type'} = "CNS";
	$nodesDic{$CNSID}{'Name'} = $CNSID;
	$nodesDic{$CNSID}{'Species'} = $referenceGenome;
	$nodesDic{$CNSID}{'Family'}="CNS";
	$nodesDic{$CNSID}{'Level'}=$level;
}
close($allCNSFile);

## Now make network sif file
my $outputSIFFileName = "$conservatoryDir/output/$networkName.network.sif";
my $outputDictionaryFileName = "$conservatoryDir/output/$networkName.dic.csv";

open(my $outputSIFFile, ">$outputSIFFileName");
open(my $outputDictionaryFile, ">$outputDictionaryFileName");
my %usedCNS;

open (my $nodeFile, $myLociPositionsFileName) || die ("Internal error");
while(<$nodeFile>) {
	chomp;
	my ($CNSID, $targetSpecies, $targetLocus, $targetRelativePosition, $targetStrand, $referenceRelativePosition, $length, $targetSequence, $absChromosome, $absStart, $geneStrand, $name) = split /,/;

	$usedCNS{$CNSID} =1;	

	$nodesDic{$targetLocus}{'Type'} = "Locus";
	$nodesDic{$targetLocus}{'Name'} = geneToGenome($targetLocus);
	$nodesDic{$targetLocus}{'Species'} = $targetSpecies;
	$nodesDic{$targetLocus}{'Family'} = $genomeDatabase{geneToGenome($targetLocus)}{'Family'};
	my @classification = split /-/, $genomeDatabase{geneToGenome($targetLocus)}{'Classification'};

	$nodesDic{$targetLocus}{'Level'}= $classification[3];

	### Remove duplicates
	if(! (defined $nodeCNSHistory{$targetLocus}{$CNSID})) { 
		print $outputSIFFile "$targetLocus has $CNSID\n";
		$nodeCNSHistory{$targetLocus}{$CNSID} = 1;
	}
}


## read the short name
open (my $lociListFile, $lociListFileName) || die ("Internal error");

while(my $lociLine = <$lociListFile>) {
	chomp($lociLine);
	my ($locus,$genename) = split /,/, $lociLine;
	$nodesDic{$locus}{'Type'} = "ReferenceLocus";	
	$nodesDic{$locus}{'Name'} = $genename;
}

#### Now dump dictionary
print $outputDictionaryFile join(",",
				"ID",
				"Type",
				"Name",
				"Species",
				"Family",
				"Level") . "\n";									

foreach my $curNode (keys %nodesDic) {
									
	print $outputDictionaryFile join(",",$curNode,
						  $nodesDic{$curNode}{'Type'},
						  $nodesDic{$curNode}{'Name'},
						  $nodesDic{$curNode}{'Species'},
						  $nodesDic{$curNode}{'Family'},
						  $nodesDic{$curNode}{'Level'}) . "\n";
										
}
close($nodeFile);


close($outputSIFFile);
close($outputDictionaryFile);

